

* Get details of gaps in the signatory records
clear
import delimited data/signatory_record_gap_details.csv, delim(",")
format beg_date end_date %tdD_m_CY

la var city "Bank branch"
la var beg_date "Beginning Date of Block"
la var end_date "Ending Date of Block"
la var beg_acct "First account # in valid block"
la var end_acct "Last account # in valid block"
la var roll "Film roll #"
la var roll_block "Block within film roll"
la var valid_record_sequences "Ordered sequence within city records"

compress

save working/all_blocks, replace


* Generate file for mapping to valid date sequences


clear
import delimited data/family_search_valid_dates.csv, delim(",")
format beg_date end_date %tdD_m_CY
sort city date_seq_no

* Note that family_search_valid_dates.csv edits out the early odd-lot records from the 2nd DC roll. The odd-lot early number books from the 2nd DC roll will be included, but since they share overlap with the other books, it is best to remove the date series.  This has the effect of omitting information from 28may1872 to 31dec1872 when the next book begins, which contains only the odd lot records from this section.

save working/valid_sequences, replace


* Create valid_bank_dates to get all dates (including those which may not have a sequential record, i.e. no account record that day)
clear
use working/valid_sequences
* Create a time series of valid dates

gen days = end_date-beg_date + 1
expand days
drop days

bys city beg_date: gen date = beg_date + _n -1
format date %td
drop beg_date end_date

save working/valid_bank_dates, replace



********************************
********************************
* CLEAN THE FAMILY SEARCH DATA *
********************************
********************************


clear
import delimited "data/United_States_Freedmans_Bank_Records_1865-1874_CID1417695.txt", delim("|") varnames(1)

la var account_number "Account Number"
la var application_date_std "Application Date of Account"
la var application_year "Year of Account"
la var bank_location "Location of Bank (formal description , may contain county)"
la var digital_gs_number "Image Directory Location"
la var film_number "Internal film record number (same id level as digital_gs_number)"
la var fs_pr_ark "Direct URL for person"
la var fs_record_ark "Record url for person (or abstract entity) - URL is a redirect that may be invalid"
la var fs_sp_ark "Direct URL for spouse (link contains cross-information about spouse)"
la var image_id "Unique identifier for each film image"
la var image_nbr "Number of film image within collection"
la var occupation "Occupation (if recorded)"
la var pr_name "Name of person"
la var pr_name_gn "Given name of person"
la var pr_name_surn "Surname of person"
la var relation_to_acct_holder "Relationship to Account Holder"
la var relation_to_acct_holder_std "Relationship to Account Holder (standardized)"
la var residence "Residence (usually descriptive)"
la var sort_key "Unique person identifier + #09 (see unique_identifier)"
la var spouse_name "Name of spouse"
la var spouse_name_gn "Given name of spouse"
la var spouse_name_surn "Surname of spouse"
la var spouse_uid "Unique numerical identifier for spouse"
la var system_addedsort_key "Unique sort key (different form from sort_key, but also unique)"
la var unique_identifier "Unique numerical identifier for person (unique to record)"
la var age_in_years "Age in years (standardized, numeric)"
la var birth_place "Place of birth (usually recorded as city/county and/or state)"
la var pr_age "Age as described in text of record"
la var where_brought_up "Where brought up (usually more descriptive than birth_place)"
la var fs_pr_fthr_ark "Direct URL for father (link contains cross-information about father)"
la var fs_pr_mthr_ark "Direct URL for mother (link contains cross-information about mother)"
la var fthr_name "Name of father"
la var fthr_name_gn "Given name of father"
la var fthr_name_surn "Surname of father"
la var fthr_uid "Unique numerical identifier for father"
la var mthr_name "Name of mother"
la var mthr_name_gn "Given name of mother"
la var mthr_uid "Unique numerical identifier for mother"
la var remarks "Remarks which don't fit into any category"
la var see_record "Reference to previous account"
la var mthr_name_surn "Surname of mother"
la var works_for "Name of employer"
la var complexion "Complexion/skin color (not standarized)"
la var pr_name_titles_terms "Title (Mrs/Sr/Jr/Dr/Hon etc)"
la var regiment_company "Regiment or Company if soldier"
la var plantation "Plantation if originally from one (usually blank)"


* Drop unchanging variables
drop event_type fs_collection_id fs_record_version fs_vis_status pr_is_principal
* Drop variables with almost no valid nonmissing information
drop death_place death_date_std death_year image_type birth_date_std birth_year mthr_is_principal spouse_is_principal fthr_is_principal
* Drop all missing primary urls
drop if missing(fs_pr_ark)
* Drop variables representing non-primary url
drop fs_record_ark fs_sp_ark fs_pr_fthr_ark fs_pr_mthr_ark 
* Drop redundant record identifiers
drop image_id sort_key system_addedsort_key
* Drop family member record (all are simply referalls to existing records)
foreach relation in "fthr" "mthr" "spouse" {
	drop `relation'_name
	drop `relation'_name_gn
	drop `relation'_name_surn
	drop `relation'_uid
}


* Merge in list of valid block sequences
tempfile temp
save `temp'

use city block_id digital_gs_number end_image_nbr beg_image_nbr roll using working/all_blocks

* This is a big join, but shouldn't be too onerous (<2.5GB)
joinby digital_gs_number using `temp', unmatched(both)
assert _merge==3
drop _merge
keep if inrange(image_nbr,beg_image_nbr, end_image_nbr)
qui duplicates report unique_identifier
assert r(unique_value)==r(N)

* Identify City and Film Roll Number

la var roll "Film Roll Number"
la var block_id "Unique identifier for contiguous blocks of accounts"





***********************
* Fix account numbers *
***********************

gen double booknum = real(account_number)

* Fix account numbers with fractions like 1/2 
replace booknum = real(regexs(1)+".5") if regexm(account_number,"([0-9]+) 1[/\\][12]")
replace booknum = real(regexs(1)+".5") if regexm(account_number,"^([0-9]+)[/\-]2$")
replace booknum = real(regexs(1)+".25") if regexm(account_number,"([0-9]+) 1[/\\]4")
replace booknum = real(regexs(1)+".75") if regexm(account_number,"([0-9]+) 3[/\\]?4")
replace booknum = 7253.5 if account_number == "725..."
replace booknum = 5300.5 if account_number == "..." & unique_identifier ==353928407 
replace booknum = 2130.5 if account_number== "2130 1\\2" & digital_gs_number == 4098533 
replace booknum = real(regexs(1)) if regexm(lower(account_number),"([0-9]+) *a") & inlist(image_nbr, 307, 549, 1123, 1131) & digital_gs_number == 4098534

* Replace A suffix with .5 and c suffix with ,75
replace booknum = real(regexs(1)+".5") if regexm(account_number,"([0-9]+)A")
replace booknum = real(regexs(1)+".75") if regexm(account_number,"([0-9]+)c")

* Replace dual account numbers with a single account (Only a 4 or 5 of these)
replace booknum = real(regexs(1)) if regexm(account_number,"([0-9]+) &")
replace booknum = real(regexs(1)) if regexm(account_number,"([0-9]+), ")


* Fix 4 D.C. records with wonky account numbers
replace booknum = real(regexs(1)) if regexm(account_number,"^([0-9]+)") & digital_gs_number== 4098148 & inrange(image_nbr,9,10)

* Temporarily replace booknum with blanks for A-H
foreach letter in A B C D E F G H {
	local mlet = lower("`letter'")
	replace booknum = .`mlet' if account_number=="`letter'"
}

* Drop final missing booknums from Richmond which begin the record or were never opened
drop if mi(booknum) & digital_gs_number == 4098534
assert booknum !=.


* Create Sort Key
* Preserve first record according to certain criteria
* (1) Is account date attached to record
* (2) Does the record have the fewest nonmissing information variables
* (3) Is the relation_to_acct_holder_std == "Organization"
* (4) Is the relation_to_acct_holder_std == "Self"
* (5) Break ties with unique_identifier
gen date_recorded = cond(!mi(application_date_std),1,2)
egen nonmissing_info = rowmiss(occupation residence age_in_years birth_place where_brought_up works_for remarks  complexion regiment_company plantation)
gen org = cond(relation_to_acct_holder_std =="Organization",1,2)
gen self = cond(relation_to_acct_holder_std =="Self",1,2)


sort city block_id image_nbr booknum date_recorded nonmissing_info org self unique_identifier
gen long record_order = _n
by city block_id image_nbr booknum (date_recorded nonmissing_info org self unique_identifier): gen byte acct_sort = _n
gen byte acct_prim = acct_sort==1
drop date_recorded nonmissing_info org self

* Create Unique "Account Number ID". Note the "missing" booknums will group all ordered missing booknums .a,.b,.c, etc
assert booknum != .
egen acct_id = group(city block_id image_nbr booknum), missing




* Variable housekeeping
order city acct_id acct_sort acct_prim account_number booknum see_record application_date_std  application_year  fs_pr_ark image_nbr pr_name  relation_to_acct_holder relation_to_acct_holder_std residence  age_in_years birth_place pr_age  remarks   record_order   complexion occupation where_brought_up works_for  regiment_company pr_name_titles_terms plantation pr_name_gn pr_name_surn bank_location digital_gs_number film_number unique_identifier

* Format string variables to make them a bit eaier to read in column
format pr_name_gn pr_name_surn %10s
format pr_age pr_name_titles_terms  %15s
format relation_to_acct_holder relation_to_acct_holder_std complexion plantation regiment_company %25s
format occupation pr_name pr_name_gn works_for residence %35s
format birth_place where_brought_up remarks regiment_company plantation %50s

la var city "City of Branch"
la var acct_sort "Sort order of persons in account"
la var acct_prim "Primary person for account"
la var acct_id "Unique created ID for all accounts" /* group(city block_id image_nbr booknum) */
la var booknum "Account number (numeric)"
la var record_order "Unique sort order for all records" /*  city - block_id - image_nbr - booknum - date_recorded - nonmissing_info - self - unique_identifier */



*******************************************************************
* Create Indicator for a Transfer Record (i.e. not a new account) *
*******************************************************************

* Create "Referring Record field"
gen byte passbook_transfer = 0
* In general if see_record exists and is less than account_number, it represents a passbook change. This is not usually recorded in the remarks field
gen ref_account = real(regexs(1)) if regexm(see_record,"([0-9]+)")
replace passbook_transfer = 1 if !missing(see_record) & ref_account<real(account_number) & !mi(real(account_number))
drop ref_account
* If reference account contains elipses, declare to be a transfer
replace passbook_transfer = 1 if see_record =="..."

* If the account has a letter or a 1/2, it is usually a transfer or a note of a joint account
replace passbook_transfer = 1 if regexm(account_number,"1/2|[A-Z]")

* Select regular expressions represent notes that indicate the account is a new passbook rather than a new account
replace passbook_transfer = 1 if regexm(lower(remarks),"(new|old) (pass)?book")
replace passbook_transfer = 1 if regexm(lower(remarks),"(lost .*book|book .*lost)")
replace passbook_transfer = 1 if regexm(lower(remarks),"^book no")
replace passbook_transfer = 1 if regexm(lower(remarks),"trans[a-z\.]+ from [0-9]")
replace passbook_transfer = 1 if regexm(lower(remarks),"^see (fo[a-z\.]|old|a/c|rec)")
replace passbook_transfer = 1 if regexm(lower(remarks),"^from.*[0-9]")
replace passbook_transfer = 1 if regexm(lower(remarks),"^trans[^ ]+") ///
	& !regexm(lower(remarks),"^transfer[a-z]* to")
replace passbook_transfer = 1 if regexm(lower(remarks),"^transfer[red]*$")
replace passbook_transfer = 1 if regexm(lower(remarks),"^old (no|A/c|dep)")
replace passbook_transfer = 1 if regexm(lower(remarks),"^cont(\.|inu)")
replace passbook_transfer = 1 if regexm(lower(remarks),"^record ([0-9]|same)")
replace passbook_transfer = 1 if regexm(lower(remarks),"^no\. [0-9]+ trans")
replace passbook_transfer = 1 if regexm(lower(remarks),"^[0-9]+[\.'']* ?(|cont)$")
replace passbook_transfer = 1 if regexm(lower(remarks), "deposited (before|see)")



* All Records in the Washington D.C. branch that are logged as of January 1 on any year are not new accounts, but new passbooks. Small number of records in other cities appear to be valid new accounts.
replace passbook_transfer = 1 if substr(application_date_std ,1,2)=="01" & substr(application_date_std ,4,3)=="Jan" & city == "Washington"

* Charleston images 139-186 contain a large discontinuous set of housekeeping items which all appear to not be valid new accounts.
replace passbook_transfer = 1 if ( digital_gs_number == 4098532 &  inrange(image_nbr ,139,189) )| inlist(unique_identifier,354189534,354189535,354189536,354189537,354189948,354189949,354189950)

* Mark housekeeping accounts which are transfers to other branches
replace passbook_transfer = 1 if regexm(pr_name, "B\. B\.")
replace passbook_transfer = 1 if regexm(pr_name ,"^(Beaufort|Jacksonville|Tallahassee|Charleston|Raleigh|New York|Philadelphia|Savannah|Wilmington|Augusta|Vicksburg|Baltimore|Atlanta|Huntsville|Memphis|New Orleans|Richmond|Charleston S\.C\.|Macon|Mobile|N. Y.|Natches|Newbern|Norfolk|Tallahasse) Branch$")


* Check to make sure all encoded missing booknums (lettered accounts) are encoded as transfers
assert passbook_transfer == 1 if  booknum>=.a

* Generate an indicator for whether this is a referring record (i.e. a new passbook or update of old account)
egen referring_record = max(passbook_transfer), by(acct_id)


* Generate indicator for Organization
gen _temp = regexm(lower(relation_to_acct_holder_std), "organization|committee|treasurer|president|chairman|officer")
egen organization = max(_temp), by(acct_id)
drop _temp

save working/family_search_full, replace


******************************************************************
* Reduce records to only acct_prim  "Primary person for account" *
******************************************************************
use working/family_search_full, clear
sort acct_id
keep if acct_prim ==1
compress
assert acct_sort == 1
qui duplicates report acct_id
assert r(N) == r(unique_value)
drop acct_sort acct_prim



**************************
* Generate and fix dates *
**************************

gen long int_date = date(application_date_std,"DMY")
format int_date %td


gen d1 = regexs(1) if regexm(application_date_std,"([0-9]+) ?([A-z][A-z][A-z])? ?([0-9][0-9][0-9][0-9])? or +")
gen m1 = regexs(2) if regexm(application_date_std,"([0-9]+)? ?([A-z][A-z][A-z]) ?([0-9][0-9][0-9][0-9])? or +")
gen y1 = regexs(3) if regexm(application_date_std,"([0-9]+)? ?([A-z][A-z][A-z])? ?([0-9][0-9][0-9][0-9]) or +")


gen d2 = regexs(1) if regexm(application_date_std,"or +([0-9]+) ?([A-z][A-z][A-z])? ?([0-9][0-9][0-9][0-9])")
gen m2 = regexs(2) if regexm(application_date_std,"or +([0-9]+)? ?([A-z][A-z][A-z]) ?([0-9][0-9][0-9][0-9])")
gen y2 = regexs(3) if regexm(application_date_std,"or +([0-9]+)? ?([A-z][A-z][A-z])? ?([0-9][0-9][0-9][0-9])")


* Replace day, month, or year with ajacent one if missing
foreach x in "d" "m" "y" {
	replace `x'1 = `x'2 if mi(`x'1)
	replace `x'2 = `x'1 if mi(`x'2)
}

gen _temp = d1 + m1 + y1
replace int_date = date(_temp, "DMY") if !mi(d1,m1,y1)
drop _temp
gen _temp = d2 + m2 + y2
gen date2  = date(_temp, "DMY") if !mi(d2,m2,y2)
drop _temp

drop d1 d2 m1 m2 y1 y2

format int_date date2 %tdD_m_CY



*************************************
* Fix for obvious date entry errors *
*************************************

* Kill date2 if years are out of bounds
replace date2 = . if !inrange(year(date2),1865,1874) & !mi(date2)

* Replace "or" dates with closest date to last valid date
gen long fill_date = int_date
bys city block_id (acct_id): replace fill_date = fill_date[_n-1] if mi(fill_date)
format fill_date %tdD_m_CY
replace int_date = date2 if abs(int_date-fill_date) > abs(date2-fill_date) & !mi(date2)
drop date2


* Mark years which are out of bounds
gen out_of_bounds_yr = !inrange(year(int_date),1865,1874) & !mi(int_date)
replace fill_date = . if out_of_bounds_yr
bys city block_id (acct_id): replace fill_date = fill_date[_n-1] if mi(fill_date)
replace int_date = mdy(month(int_date),day(int_date),year(fill_date)) if out_of_bounds_yr
assert inrange(year(int_date),1865,1874) | mi(int_date)
drop out_of_bounds_yr

drop fill_date

*************************************************
* Generate longest ordered sequence with Python *
*************************************************

* Separately save all observations without missing dates and with missing dates
gen long date = int_date
gen byte missing_date = mi(date)
tempfile temp
save `temp'
use `temp' if !mi(int_date)
save working/nonmissing, replace
use `temp' if mi(int_date)
save working/missing, replace



use working/nonmissing

sort city block_id acct_id
by city block_id (acct_id): gen long date_count = _n



* Create a file for python to read
tempfile python_merge
save `python_merge'
format date %15.0f
outsheet block_id date date_count using "working/subseq.csv", non comma replace

* Call python program to extract the longest increasing subsequence for each roll. Users may need to edit the path to the python directory.
python:
from subseq import output_subseq
output_subseq()
end


insheet using "working/input.csv", non comma clear
rename v1 block_id
rename v2 date
rename v3 date_count
merge 1:1 block_id date date_count using `python_merge', assert(match using)
gen main_sequence = _merge==3
drop _merge  



* Create last and next valid date as a function of main sequence
gen last_date = date if main_sequence == 1
gen next_date = date if main_sequence == 1

sort city block_id acct_id
by city block_id: replace last_date = last_date[_n-1] if mi(last_date)
gsort city block_id -acct_id
by city block_id: replace next_date = next_date[_n-1] if mi(next_date)

* Generate modified_sequence to track fixes in sequence
gen byte modified_sequence = main_sequence

* Replace dates where the year is obviously wrong and replacing it with the previous year puts the date in sequence
* Replace with year of last date
replace date = mdy(month(date),day(date), year(last_date)) if modified_sequence==0 & inrange( mdy(month(date),day(date), year(last_date)),last_date,next_date)
replace modified_sequence = 1 if modified_sequence==0 & inrange( mdy(month(date),day(date), year(last_date)),last_date,next_date)
* Replace with year of next date
replace date = mdy(month(date),day(date), year(next_date)) if modified_sequence==0 & inrange( mdy(month(date),day(date), year(next_date)),last_date,next_date)
replace modified_sequence = 1 if modified_sequence==0 & inrange( mdy(month(date),day(date), year(next_date)),last_date,next_date)


* Replace dates where the month may be wrong and replacing it puts the date within a sequence of a week or less
replace date = mdy(month(last_date),day(date), year(date)) if modified_sequence==0 & inrange(mdy(month(last_date),day(date), year(date)),last_date,next_date) & next_date-last_date<=7
replace modified_sequence = 1 if modified_sequence==0 & inrange(mdy(month(last_date),day(date), year(date)),last_date,next_date) & next_date-last_date<=7



* Add back records with missing dates
append using working/missing

* Update sequence variables 
sort acct_id
replace main_sequence = 0 if missing_date==1
replace modified_sequence = 0 if missing_date==1


* Replace last_date and next_date with modified contiguous dates and run again
sort city block_id acct_id
by city block_id: replace last_date = last_date[_n-1] if mi(last_date)
gsort city block_id -acct_id
by city block_id: replace next_date = next_date[_n-1] if mi(next_date)


sort city block_id acct_id
* fill entries with missing dates with last and next dates from above and below
format date last_date next_date %td
by city block_id: assert next_date - last_date >=0  /* Might be missing */


*************************
* Fill sandwiched dates *
*************************
* If record is sandwiched between two identical dates, copy directly
* If date is sandwitched betweed indentical dates, tag it
gen byte date_sandwiched = last_date == next_date & mi(date)
replace date = last_date if last_date == next_date & mi(date)



*********************
* Interpolate dates *
*********************

sort city block_id acct_id

* If records are separated by more than one day, interpolate
gen date_interpolated = mi(date)

* Generate ID for each gap
gen long _temp = acct_id if mi(date)
by city block_id (acct_id): replace _temp = _temp[_n-1] if !mi(_temp,_temp[_n-1])
egen long gap_id = group(_temp)
drop _temp
egen gapsize = count(gap_id), by(gap_id)
bys gap_id (acct_id): gen long order_in_gap = _n

sort city block_id image_nbr acct_id
replace date = last_date + round((next_date-last_date)*order_in_gap/(gapsize+1)) if mi(date)


* Roll back interpolated Sundays to Saturday
replace date = date-1 if  date_interpolated & dow(date) == 0
format date %td


* Roll back ALL Sundays to Saturday
replace date = date-1 if dow(date) == 0

* This leaves a handfull of missing dates at the very beginning or end of the record.
by city block_id: egen _temp1 = min(image_nbr * !mi(date))
by city block_id: egen _temp2 = max(image_nbr * !mi(date))
* Check if all missing observations occur continguously at beginning or end of film roll
assert _temp1 < image_nbr | _temp2 > image_nbr if mi(date)
drop _temp1 _temp2

* Assign the first of last valid date, and note the issue
gen initial_records =       !mi(next_date) & mi(last_date) & mi(date)
replace date = next_date if !mi(next_date) & mi(last_date) & mi(date)
gen final_records =         !mi(last_date) & mi(next_date) & mi(date)
replace date = last_date if !mi(last_date) & mi(next_date) & mi(date)


gen new_account = referring_record==0

la var date "Account date (potentially interpolated)"
la var date_count "Date sequence in main sequence"
la var referring_record "Reference to prior record (new passbook)"
la var new_account "Record is a new account (not a transfer)"
la var last_date "Base date if date is interpolated"
la var next_date "Final date if date is interpolated"
la var int_date "Numberical translation from text account date"
la var main_sequence "Record is part of valid longest increasing sequence"
la var modified_sequence "Record is part of longest increasing sequence after error correction"
la var missing_date "Date was missing from original records"
la var date_sandwiched "Record was sandwitched between two identical dates"
la var date_interpolated "Date is interpolated between nonmissing sequential records"
la var gapsize "# of records in interpolated gap"
la var order_in_gap "Order of interpolated record"

save working/cleaned_fs_records, replace

use working/cleaned_fs_records
joinby city using working/valid_sequences, unmatched(both)
assert _merge==3
drop _merge
keep if inrange(date,beg_date, end_date)
qui duplicates report unique_identifier
assert r(unique_value) == r(N)

save working/cleaned_valid_sequential_records, replace












